Our goal is to find the variables within the dataset that affect SalePrice the most. We will use a linear regression model to predict SalePrice based on the variables we find.

library(tidyverse)
library(knitr)
library(mosaic)
library(plotly)
library(pander)



# Shared ggplot theme for a dark canvas: black backgrounds, white text and
# ticks, white/gray gridlines. Added to every plot below via `... + dark_theme`.
# The repeated element specs are built once and reused.
white_text <- element_text(color = "white")
black_fill <- element_rect(fill = "black", color = "black")

dark_theme <- theme(
  plot.background = black_fill,
  panel.background = black_fill,
  panel.grid.major = element_line(color = "white"),
  panel.grid.minor = element_line(color = "gray"),
  axis.text = white_text,
  axis.title = white_text,
  plot.title = white_text,
  plot.subtitle = white_text,
  plot.caption = white_text,
  legend.background = element_rect(fill = "gray20"),  # dark legend background
  legend.text = white_text,
  legend.title = white_text,
  axis.ticks = element_line(color = "white")
)
# Load the Kaggle "House Prices: Advanced Regression Techniques" training set.
train <- read_csv("house-prices-advanced-regression-techniques-1/train.csv")
#View(train)
#glimpse(train)  # Gives a compact summary


# Alias the columns whose names begin with a digit (`1stFlrSF`, `2ndFlrSF`)
# to syntactic R names so later code can reference them without backticks.
train <- mutate(
  train,
  X1stFlrSF = `1stFlrSF`,
  X2ndFlrSF = `2ndFlrSF`
)

# Feature engineering:
#  - Alley: recode missing values as an explicit "None" factor level.
#  - TotalSF: combined first-floor, second-floor, and basement square footage.
#  - RichNbrhd: 1 for the three neighborhoods treated as high-end, else 0.
train <- train %>%
  mutate(
    Alley = factor(coalesce(as.character(Alley), "None")),
    TotalSF = X1stFlrSF + X2ndFlrSF + TotalBsmtSF,
    RichNbrhd = if_else(Neighborhood %in% c("StoneBr", "NridgHt", "NoRidge"), 1, 0)
  )

#Or use mutate and create a new "TotalSF" variable that allows for a simple linear regression model that is just as powerful, but far easier to graph and interpret.

# NOTE(review): a second mutate() recomputing TotalSF = X1stFlrSF + X2ndFlrSF +
# TotalBsmtSF used to live here. TotalSF is already created (identically) in
# the mutate() above, so the duplicate no-op computation has been removed.

# Collapse Neighborhood into three price tiers, and raise any OverallQual
# value below 4 up to 4 so the lowest quality scores are pooled together.
train <- train %>%
  mutate(
    NeighborhoodGroup = case_when(
      Neighborhood %in% c("NridgHt", "StoneBr", "NoRidge") ~ "Expensive",
      Neighborhood %in% c("OldTown", "BrDale", "MeadowV") ~ "Cheap",
      TRUE ~ "Midrange"
    ),
    OverallQual = pmax(OverallQual, 4)
  )

#correlations <- cor(train %>% select(where(is.numeric)), use = "complete.obs")
#correlations["SalePrice",] %>% sort(decreasing = TRUE)

# GarageArea

# Cap GarageCars at 3 and raise OverallCond values below 3 up to 3,
# pooling the sparse extreme categories.
train <- train %>%
  mutate(
    GarageCars = pmin(GarageCars, 3),
    OverallCond = pmax(OverallCond, 3)
  )

# ggplot(train, aes(x=YearBuilt, y=SalePrice, color=as.factor(NeighborhoodGroup))) + 
#   geom_point() +
#   facet_wrap(~OverallCond) +
#   geom_line(aes(y=lm.saunders.qual$fit))


# Final candidate model: six main effects, the neighborhood tier, and two
# three-way interactions letting the TotalSF slope vary with quality and
# garage size within each neighborhood tier.
lm.saunders.qual <- lm(
  SalePrice ~ TotalSF + OverallQual + OverallCond + GrLivArea +
    GarageCars + YearBuilt + NeighborhoodGroup +
    OverallQual:TotalSF:NeighborhoodGroup +
    NeighborhoodGroup:TotalSF:GarageCars,
  data = train
)



# Tidy coefficient table for the fitted model, rendered with kable().
summary_df <- lm.saunders.qual %>% broom::tidy()
kable(summary_df, digits = 3, caption = "Summary of lm.saunders.qual")
Summary of lm.saunders.qual
term estimate std.error statistic p.value
(Intercept) -925180.720 82299.928 -11.242 0.000
TotalSF 28.602 4.849 5.898 0.000
OverallQual 27144.563 2003.119 13.551 0.000
OverallCond 8716.731 790.348 11.029 0.000
GrLivArea 24.894 3.276 7.599 0.000
GarageCars -16123.136 4372.174 -3.688 0.000
YearBuilt 408.939 41.913 9.757 0.000
NeighborhoodGroupExpensive -103960.628 11239.209 -9.250 0.000
NeighborhoodGroupMidrange 19278.296 6042.581 3.190 0.001
TotalSF:OverallQual:NeighborhoodGroupCheap -4.870 0.964 -5.053 0.000
TotalSF:OverallQual:NeighborhoodGroupExpensive 1.545 0.787 1.963 0.050
TotalSF:OverallQual:NeighborhoodGroupMidrange -4.784 0.701 -6.823 0.000
TotalSF:GarageCars:NeighborhoodGroupCheap 13.124 2.334 5.624 0.000
TotalSF:GarageCars:NeighborhoodGroupExpensive 10.916 2.034 5.368 0.000
TotalSF:GarageCars:NeighborhoodGroupMidrange 11.929 1.739 6.859 0.000
###################################
# 
# library(rpart)
# tree_model <- rpart(SalePrice ~ TotalSF + OverallQual + Neighborhood, data = train)
# printcp(tree_model)
# 
# pruned_tree <- prune(tree_model, cp = 0.016146)
# printcp(pruned_tree)
# 
# print(pruned_tree)
# summary(pruned_tree)
# 
# 
# train <- train %>%
#   mutate(
#     HighQuality = ifelse(OverallQual >= 7.5, 1, 0),
#     LargeSF = ifelse(TotalSF >= 2474, 1, 0),
#     VeryLargeSF = ifelse(TotalSF >= 3708.5, 1, 0)
#   )
# 
# lm_model <- lm(SalePrice ~ HighQuality + LargeSF + VeryLargeSF + Neighborhood, data = train)
# summary(lm_model)
# 
# train$Neighborhood <- as.factor(train$Neighborhood)
# 
# train <- train %>%
#   mutate(NeighborhoodGroup = case_when(
#     Neighborhood %in% c("NridgHt", "StoneBr", "NoRidge") ~ "Expensive",
#     Neighborhood %in% c("OldTown", "BrDale", "MeadowV") ~ "Cheap",
#     TRUE ~ "Midrange"
#   ))%>% 
#   mutate( OverallQual= ifelse(OverallQual < 4, 4, OverallQual))
# 
# lm_model <- lm(SalePrice ~ OverallQual + TotalSF + NeighborhoodGroup+NeighborhoodGroup:OverallQual:TotalSF, data = train)
# summary(lm_model)
# 
# 
# lm.saunders.qual <-lm(SalePrice~ TotalSF +OverallQual+OverallCond+ GrLivArea + GarageCars + YearBuilt+ NeighborhoodGroup + OverallQual:TotalSF:NeighborhoodGroup:,data=train)
# summary(lm.saunders.qual)
# 
# ##################################
# 
# plot(SalePrice ~ TotalSF,col= as.factor(OverallQual), data=train)
# 
# OverallQual = ifelse(OverallQual < 4, 4, OverallQual)
# 
# 
# ggplot(train, aes(x=TotalSF, y=SalePrice, color=as.factor(NeighborhoodGroup))) + 
#   geom_point() +
#   facet_wrap(~OverallQual) +
#   geom_line(aes(y=lm_model$fit))
# 
# 
# 
# 
# 
# summary(lm.saunders.qual)
# 
# #boxCox(lm.saunders.qual, lambda = seq(-2, 2, by = 0.1))
# 
# exp(coef(lm.sqft.rich.log))
# #are the multipliers for the increase in average Sale Price.
# # So 1.000337 means each square foot makes the predicted value 1.000337 times as large. Or, 1,000 square feet of addition makes the home 40% greater in value.
# exp(coef(lm.sqft.rich.log)[2]*1000)
# 
# 
# exp(coef(lm.saunders.qual)[2]*1000)
# 
# 
# # Someone got above .8 using only rich neighborhoods, square footage and quality. What?
# 
# # Leverage = Dots that are outlines have more effect than normal dots, cooks distance tells us if a dots has too much leverage and we need a new model. 

From our model we see that our R^2 value is 0.85, which is a good value. This means that 85% of the variance in SalePrice is explained by the variables in our model. Our error bound is about $60k, which is alright. This might be comparable to a semi-experienced person guessing the price of a house.

Now we will validate our model by splitting the data into a training and testing set. We will use the training set to train our model and the testing set to validate our model.

# Reproducible hold-out split: sample 1000 of the 1460 rows for training.
set.seed(121)

num_rows <- 1000 #1460 total
keep <- sample(seq_len(nrow(train)), num_rows)

# NOTE(review): a full-data lm() fit was removed from this spot -- it was
# immediately overwritten by the mytrain fit below without ever being used.


# Hold-out validation: fit on the sampled rows, score the rows left out.
mytrain <- train[keep, ]   # training subset (like "rbdata" in the class notes)
mytest <- train[-keep, ]   # test subset (like "rbdata2" in the class notes)

# Same formula as the full-data model, refit on the training subset only.
lm.saunders.qual <- lm(
  SalePrice ~ TotalSF + OverallQual + OverallCond + GrLivArea +
    GarageCars + YearBuilt + NeighborhoodGroup +
    OverallQual:TotalSF:NeighborhoodGroup +
    NeighborhoodGroup:TotalSF:GarageCars,
  data = mytrain
)

# Out-of-sample predictions used for the validation R^2 below.
predictions <- predict(lm.saunders.qual, newdata = mytest)



# Validation R^2 computed by hand: R^2 = 1 - SSE / SSTO on the held-out rows.
actual <- mytest$SalePrice

# SSTO: total variation of the held-out sale prices around their mean
ss_total <- sum((actual - mean(actual))^2)

# SSE: squared prediction error of the model on the held-out rows
ss_error <- sum((actual - predictions)^2)

r2_manual <- 1 - ss_error / ss_total


library(tibble)

# One-row summary table reporting the hold-out R^2.
r2_table <- tribble(
  ~Metric,                           ~Value,
  "Manually Computed Validation R²", round(r2_manual, 4)
)

knitr::kable(r2_table, caption = "Model Evaluation Metric")
Model Evaluation Metric
Metric Value
Manually Computed Validation R² 0.7934

Our manually computed validation R^2 value is 0.7934 which is great.

Now we will graph each variable's interaction with SalePrice, along with our regression model's lines, to see how on track we are at predicting price.

Total SF and Overall Quality vs Sale Price

# Refit the final model on the full training data so fitted values exist for
# every row (the validation fit above used only `mytrain`).
lm.saunders.qual <- lm(
  SalePrice ~ TotalSF + OverallQual + OverallCond + GrLivArea +
    GarageCars + YearBuilt + NeighborhoodGroup +
    OverallQual:TotalSF:NeighborhoodGroup +
    NeighborhoodGroup:TotalSF:GarageCars,
  data = train
)

# NOTE(review): an unused `b <- coef(lm.saunders.qual)` was removed here;
# `b` was never referenced anywhere else in the script.

# In-sample fitted values, plotted against the actual prices below.
train$predicted <- predict(lm.saunders.qual)

# Predicted vs actual sale price: points on the dashed green identity line
# would be perfect predictions.
ggplot(train, aes(x = predicted, y = SalePrice)) +
  geom_point(color = "purple", alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1, color = "green", linetype = "dashed") +
  labs(title = "Predicted vs Actual SalePrice") +
  theme_minimal() +
  dark_theme

We can see that our model does pretty well! It seems that we have a harder time as the price goes up, because the residual variance is increasing.

OverallQual : TotalSF: NeighborhoodGroup plot

library(ggplot2)
library(dplyr)

# Keep three representative quality levels (4, 6, 8) so the panels stay
# readable, and convert OverallQual to a factor for discrete coloring.
plot_data <- subset(mytrain, OverallQual %in% c(4, 6, 8))
plot_data$OverallQual <- factor(plot_data$OverallQual)

# TotalSF x OverallQual x NeighborhoodGroup interaction: one panel per
# neighborhood tier, one colored OLS line per quality level.
ggplot(plot_data, aes(x = TotalSF, y = SalePrice, color = OverallQual)) +
  geom_point(alpha = 0.5) +
  geom_smooth(se = FALSE, method = "lm") +
  facet_wrap(~ NeighborhoodGroup) +
  theme_minimal() +
  labs(
    title = "Interaction between TotalSF, OverallQual, and NeighborhoodGroup",
    x = "Total Square Footage",
    y = "Sale Price",
    color = "Overall Qual"
  ) +
  dark_theme

As we can see there is a clear relationship between these variables. When we split them up by NeighborhoodGroup we can see that the Expensive group has a much higher price than the Midrange and Cheap groups. So, making the interaction term gives our model more accuracy.

3-way interaction plot TotalSF - GarageCars - NeighborhoodGroup

# 3D scatter of TotalSF x GarageCars vs SalePrice, colored by neighborhood
# tier. All three axes share the same dark styling, so build the spec once.
dark_axis <- list(
  backgroundcolor = "black",
  gridcolor = "gray",
  zerolinecolor = "gray",
  color = "white"
)

plot_ly(
  train,
  x = ~TotalSF, y = ~GarageCars, z = ~SalePrice,
  color = ~NeighborhoodGroup,
  colors = c("cyan", "blue", "purple"),
  type = "scatter3d",
  marker = list(size = 4)
) %>%
  layout(
    scene = list(
      xaxis = dark_axis,
      yaxis = dark_axis,
      zaxis = dark_axis,
      bgcolor = "black"
    ),
    paper_bgcolor = "black",
    plot_bgcolor = "black"
  )

We can see that the houses with more places for cars are more expensive. This is a good sign that our model is working well. They also have a different slope for square footage, which is interesting.

Total SF and Neighborhood Group vs Sale Price

library(ggplot2)

# TotalSF vs SalePrice, faceted by NeighborhoodGroup; the per-group OLS
# lines show how the square-footage slope differs across tiers.
ggplot(train, aes(x = TotalSF, y = SalePrice, color = NeighborhoodGroup)) +
  geom_point(alpha = 0.5) +
  geom_smooth(se = FALSE, method = "lm", formula = y ~ x) +
  facet_wrap(~ NeighborhoodGroup) +
  labs(title = "TotalSF x NeighborhoodGroup with SalePrice") +
  theme_minimal() +
  dark_theme

We can see that the Expensive group has a much higher price than the Midrange and Cheap groups. So, making the interaction term gives our model more accuracy.

Total SF and Overall Quality vs Sale Price

# TotalSF vs SalePrice with one facet per OverallQual level, revealing the
# different slopes hidden within each quality group.
ggplot(train, aes(x = TotalSF, y = SalePrice, color = as.factor(OverallQual))) +
  geom_point(alpha = 0.4) +
  geom_smooth(se = FALSE, method = "lm", formula = y ~ x) +
  facet_wrap(~ OverallQual) +
  labs(title = "OverallQual x TotalSF with SalePrice") +
  theme_minimal() +
  dark_theme

As we can see there are different slopes hidden within the different OverallQual groups. This is a good sign that our model is working well.

library(ggplot2)


# Helper: scatter one predictor from `mytrain` against SalePrice with an
# OLS trend line, in the shared dark theme.
plot_effect <- function(xvar, pt_color, plot_title) {
  ggplot(mytrain, aes(x = {{ xvar }}, y = SalePrice)) +
    geom_point(alpha = 0.4, size = 1.7, color = pt_color) +
    geom_smooth(method = "lm", se = TRUE, color = "blue") +
    labs(title = plot_title) +
    dark_theme
}

plot_effect(OverallCond, "yellow", "Effect of OverallCond on SalePrice")

plot_effect(YearBuilt, "green", "Effect of YearBuilt on SalePrice")

plot_effect(GrLivArea, "purple", "Effect of GrLivArea on SalePrice")

Here are our last 3 variables' influence on SalePrice. We can see that OverallCond has a very weak relationship with SalePrice. YearBuilt and GrLivArea have a much stronger relationship with SalePrice.

Our model is pretty good and explains a lot of variance in otherwise hard to parse through data.